import yaml
import pandas as pd
from functools import reduce
import numpy as np
def get_config():
    """Load the analysis settings from ``config.yaml``.

    The file name is relative, so it is resolved against the current
    working directory.

    Returns:
        The parsed YAML document, exactly as produced by ``yaml.safe_load``.

    Raises:
        OSError: If ``config.yaml`` cannot be opened.
        yaml.YAMLError: If the file content is not valid YAML.
    """
    # Decode as UTF-8 explicitly: without ``encoding`` Python falls back to
    # the platform locale, which is not UTF-8 on every system.
    with open("config.yaml", 'r', encoding="utf-8") as stream:
        config = yaml.safe_load(stream)
    return config
# Resolve the dataset location from the YAML configuration.
config = get_config()
filepath = config['datapath_as4']

# Only paths ending in '.csv' are accepted; anything else aborts the run.
if not filepath.endswith('.csv'):
    raise Exception("incorrect file format")
df = pd.read_csv(filepath)

# Share of missing cells per column: 1 - (non-null count / number of rows).
# The value is printed as a fraction between 0 and 1.
row_total = float(len(df))
print("Percentage of missing data for each ariable:\n", 1 - df.count() / row_total)

independent_vars = ['PhoneReach', 'PhoneTime', 'Tired', 'Breakfast']  # treated as the causes
dependent_vars = ['Hours', 'Enough']  # treated as the outcomes that are affected
print(f'Dependent vars: {dependent_vars}')
print(f'Independent vars: {independent_vars}')

# Discard every row that has a missing value, then store Hours as integers.
df = df.dropna()
df['Hours'] = pd.to_numeric(df['Hours']).astype(int)
# keep Tired values inside the 1-5 scale
def limitTiredValues(row):
    """Return the ``Tired`` score of *row*, clamped to the range 1..5.

    Args:
        row: Any mapping-like object that supports ``row['Tired']``.

    Returns:
        5 when the score is above 5, 1 when it is below 1, otherwise the
        score itself, unchanged.
    """
    lowest, highest = 1, 5
    score = row['Tired']
    if score > highest:
        return highest
    return lowest if score < lowest else score
# Store the clamped score in a new column, then remove the raw 'Tired' column.
df['Tiredness_value'] = df.apply(limitTiredValues, axis=1)
df = df.drop(columns='Tired')
def convertYesNoToBool(row, column_name):
    """Translate a 'Yes'/'No' answer stored in *row* into a boolean.

    Args:
        row: Any mapping-like object that supports ``row[column_name]``.
        column_name: Key of the answer to translate.

    Returns:
        True for 'Yes', False for 'No'.

    Raises:
        Exception: If the answer is neither 'Yes' nor 'No'.
    """
    answer = row[column_name]
    for text, flag in (('Yes', True), ('No', False)):
        if answer == text:
            return flag
    raise Exception("incorrect patient answer")
def covertToBoolColumn(old_column_name, new_column_name):
    """Swap a Yes/No column of the global ``df`` for a boolean column.

    The answers in ``old_column_name`` are translated row by row with
    ``convertYesNoToBool`` and stored under ``new_column_name``; the old
    column is then dropped and the global ``df`` is rebound to the result.

    Args:
        old_column_name: Name of the existing 'Yes'/'No' column.
        new_column_name: Name of the boolean column to create.

    Raises:
        Exception: Propagated from ``convertYesNoToBool`` when a row holds
            an answer other than 'Yes' or 'No'.
    """
    global df
    bool_values = df.apply(convertYesNoToBool, axis=1, args=(old_column_name,))
    df[new_column_name] = bool_values
    df = df.drop(columns=old_column_name)
# Replace each Yes/No survey column with a boolean column under a clearer name.
for old_name, new_name in (
    ('Enough', 'EnoughSleep'),
    ('PhoneReach', 'InPhoneReach'),
    ('PhoneTime', 'UsedPhoneBeforeSleep'),
):
    covertToBoolColumn(old_name, new_name)
# 'Breakfast' stays as text at this point; its boolean conversion is disabled:
# covertToBoolColumn('Breakfast', 'HadBreakfast')
df.info()
Percentage of missing data for each ariable: Enough 0.000000 Hours 0.019231 PhoneReach 0.000000 PhoneTime 0.000000 Tired 0.000000 Breakfast 0.000000 dtype: float64 Dependent vars: ['Hours', 'Enough'] Independent vars: ['PhoneReach', 'PhoneTime', 'Tired', 'Breakfast'] <class 'pandas.core.frame.DataFrame'> Int64Index: 102 entries, 0 to 103 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Hours 102 non-null int64 1 Breakfast 102 non-null object 2 Tiredness_value 102 non-null int64 3 EnoughSleep 102 non-null bool 4 InPhoneReach 102 non-null bool 5 UsedPhoneBeforeSleep 102 non-null bool dtypes: bool(3), int64(2), object(1) memory usage: 3.5+ KB
def hadBreakfast(row):
    """Turn the ``Breakfast`` answer of *row* into a descriptive label.

    Args:
        row: Any mapping-like object that supports ``row['Breakfast']``.

    Returns:
        "had breakfast" when the answer is exactly 'Yes'; "no breakfast"
        for every other value.
    """
    return "had breakfast" if row['Breakfast'] == 'Yes' else "no breakfast"
# Overwrite the raw answers with the descriptive labels used as plot categories.
df['Breakfast'] = df.apply(hadBreakfast, axis=1)
df.head()
| Hours | Breakfast | Tiredness_value | EnoughSleep | InPhoneReach | UsedPhoneBeforeSleep | |
|---|---|---|---|---|---|---|
| 0 | 8 | had breakfast | 3 | True | True | True |
| 1 | 6 | no breakfast | 3 | False | True | True |
| 2 | 6 | had breakfast | 2 | True | True | True |
| 3 | 7 | no breakfast | 4 | False | True | True |
| 4 | 7 | had breakfast | 2 | False | True | True |
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, CategoricalColorMapper
from bokeh.io import output_notebook
from bokeh.resources import INLINE
# Send Bokeh output to the notebook, using the INLINE resources object.
output_notebook(INLINE)

# Only the three columns needed by the scatter plot are handed to Bokeh.
source = ColumnDataSource(
    data={'Hours': df['Hours'], 'Tiredness_value': df['Tiredness_value'], 'Breakfast': df['Breakfast']}
)

# Colour per breakfast label: green = had breakfast, red = no breakfast.
palette = ['green', 'red']
mapper = CategoricalColorMapper(factors=['had breakfast', 'no breakfast'], palette=palette)

# ``width``/``height`` are the size keywords the other figures in this file
# already use; ``plot_width``/``plot_height`` no longer exist in Bokeh 3.
pl = figure(width=800, height=400,
            title='The relation between being tired and hours of sleep',
            x_axis_label='Hours of sleep', y_axis_label='Tiredness value')
# Low alpha so that points sharing the same (Hours, Tiredness) pair show up darker.
pl.scatter('Hours', 'Tiredness_value', source=source, size=9, alpha=0.3,
           color={'field': 'Breakfast', 'transform': mapper}, legend_field='Breakfast')
show(pl)
# Columns summarised throughout this cell.
target_columns = ['Hours', 'Tiredness_value']

# Whole sample.
all_targets = df[target_columns]
mean = all_targets.mean(numeric_only=True)
print(f'Mean of target parameters:\n{mean}\n')
std = all_targets.std(numeric_only=True)
print(f'Standart deviation of target parameters:\n{std}\n')

# Split the sample by breakfast label; both subsets are reused by later plots.
df_no_breakfast = df[df['Breakfast'] == 'no breakfast']
df_had_breakfast = df[df['Breakfast'] == 'had breakfast']

# Patients who had breakfast.
had_targets = df_had_breakfast[target_columns]
mean = had_targets.mean(numeric_only=True)
print(f'Mean of target parameters (had breakfast):\n{mean}\n')
std = had_targets.std(numeric_only=True)
print(f'Standart deviation of target parameters (had breakfast):\n{std}\n')

# Patients who did not have breakfast.
no_targets = df_no_breakfast[target_columns]
mean = no_targets.mean(numeric_only=True)
print(f'Mean of target parameters (no breakfast):\n{mean}\n')
std = no_targets.std(numeric_only=True)
print(f'Standart deviation of target parameters (no breakfast):\n{std}')

# quick overview of the same statistics for the whole sample
df.describe()
# In the recorded run, patients who had breakfast sleep longer on average
# (about 6.9 h) than patients who did not (about 6.3 h),
# and they are less tired on average (about 2.9)
# than patients who did not have breakfast (about 3.4).
Mean of target parameters: Hours 6.656863 Tiredness_value 3.088235 dtype: float64 Standart deviation of target parameters: Hours 1.417676 Tiredness_value 1.015747 dtype: float64 Mean of target parameters (had breakfast): Hours 6.918033 Tiredness_value 2.885246 dtype: float64 Standart deviation of target parameters (had breakfast): Hours 1.268793 Tiredness_value 0.950410 dtype: float64 Mean of target parameters (no breakfast): Hours 6.268293 Tiredness_value 3.390244 dtype: float64 Standart deviation of target parameters (no breakfast): Hours 1.549587 Tiredness_value 1.045898 dtype: float64
| Hours | Tiredness_value | |
|---|---|---|
| count | 102.000000 | 102.000000 |
| mean | 6.656863 | 3.088235 |
| std | 1.417676 | 1.015747 |
| min | 2.000000 | 1.000000 |
| 25% | 6.000000 | 2.000000 |
| 50% | 7.000000 | 3.000000 |
| 75% | 7.000000 | 4.000000 |
| max | 10.000000 | 5.000000 |
from bokeh.models import BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter, FixedTicker
def _tiredness_by_hours_grid(title, source):
    """Build one Hours x Tiredness figure made of translucent unit squares.

    One square is drawn per row of *source*, so (Hours, Tiredness_value)
    pairs that occur often end up visibly darker.
    """
    grid = figure(title=title,
                  x_axis_location="above", width=800, height=400,
                  toolbar_location='below',
                  x_axis_label='Hours of sleep', y_axis_label='Tiredness value')
    grid.rect(x="Hours", y="Tiredness_value", width=1, height=1,
              source=source,
              fill_color={'field': 'Breakfast', 'transform': mapper},
              line_color=None, alpha=0.2)
    return grid


# One data source per breakfast group, each carrying the same three columns.
source1_had_breakfast = ColumnDataSource(
    data={'Hours': df_had_breakfast['Hours'],
          'Tiredness_value': df_had_breakfast['Tiredness_value'],
          'Breakfast': df_had_breakfast['Breakfast']}
)
source2_no_breakfast = ColumnDataSource(
    data={'Hours': df_no_breakfast['Hours'],
          'Tiredness_value': df_no_breakfast['Tiredness_value'],
          'Breakfast': df_no_breakfast['Breakfast']}
)

p = _tiredness_by_hours_grid(
    "The relation between being tired and hours of sleep for patients who had breakfast",
    source1_had_breakfast,
)
p2 = _tiredness_by_hours_grid(
    "The relation between being tired and hours of sleep for patients who didn't have breakfast",
    source2_no_breakfast,
)

# Author's reading of the plots: most patients who had breakfast sleep 6-8 hours,
# whereas patients who didn't have breakfast mostly sleep 5-7 hours.
show(p)
show(p2)
from scipy.stats import norm
# Histogram of sleep duration: eight one-hour bins over [2, 10], normalised to
# a density so that it shares a scale with the normal curve drawn below.
# ``density`` must be the boolean True; the string 'True' only worked because
# any non-empty string is truthy.
arr_hist, edges = np.histogram(df['Hours'], bins=8, range=[2, 10], density=True)
hours = pd.DataFrame({'Hours': arr_hist, 'left': edges[:-1], 'right': edges[1:]})

# The bars are densities, not head counts, so the y axis is labelled accordingly.
# ``width``/``height`` match the size keywords used by the other figures in
# this file (``plot_width``/``plot_height`` no longer exist in Bokeh 3).
p = figure(height=800, width=800, title='Hours of sleep distribution',
           x_axis_label='Hours of sleep', y_axis_label='Probability density')
p.quad(bottom=0, top=hours['Hours'],
       left=hours['left'], right=hours['right'],
       fill_color='green', alpha=0.5, line_color='black')

# average (vertical line from 0 up to the tallest bar)
mean_hours_value = df[['Hours']].mean(numeric_only=True)
p.line(np.full(2, mean_hours_value), [0, hours['Hours'].max()], legend_label="Mean value", line_width=4, color='red')

# robust estimation (median), drawn the same way
median_hours_value = df[['Hours']].median(numeric_only=True)
p.line(np.full(2, median_hours_value), [0, hours['Hours'].max()], legend_label="Median value", line_width=4, color='black')

# normal distribution line, parameterised by the sample mean and the sample
# variance (pandas' ``var`` uses ddof=1 by default)
x = np.linspace(2, 10, 200)
mu_MM = mean_hours_value
sigma2_MM = df[['Hours']].var(numeric_only=True)
sigma_MM = np.sqrt(sigma2_MM)
# mu_MM and sigma_MM are one-element Series, so every pdf value is a
# one-element array; ``ravel`` flattens the result for plotting.
rv = np.array([norm.pdf(xi, loc=mu_MM, scale=sigma_MM) for xi in x])
p.line(x, rv.ravel(), legend_label="Normal distribution line", line_width=4, color='blue')

# Author's visual reading: the data look approximately normally distributed.
# NOTE(review): the Shapiro-Wilk test run on the same column elsewhere in this
# file rejects normality at alpha = 0.05, so treat this as a rough impression.
show(p)
from scipy import stats
# Shapiro-Wilk normality test on the sleep durations.
shapiro_test = stats.shapiro(df['Hours'])
# In the recorded run the p-value is 7.15833084541373e-05, which is below
# alpha (0.05): there is sufficient evidence to say that the sample does not
# come from a normal distribution.
shapiro_test
ShapiroResult(statistic=0.93398118019104, pvalue=7.15833084541373e-05)
import matplotlib.pyplot as plt
import seaborn as sns
# Draw the same chart twice, once per grouping column: a box plot of Hours
# with the individual observations overlaid as a swarm.
for category in ('Tiredness_value', 'Breakfast'):
    fig = plt.figure(figsize=(10, 8))
    ax = sns.boxplot(x=category, y='Hours', data=df, color='#ffccff')
    ax = sns.swarmplot(x=category, y='Hours', data=df, color='#660066', size=4)
    plt.show()
from scipy.stats import levene
# Levene's test for equal variances of Hours across the five tiredness levels.
groups_by_value_of_tiredness = df.groupby("Tiredness_value")
(group_hours_1, group_hours_2, group_hours_3,
 group_hours_4, group_hours_5) = (
    groups_by_value_of_tiredness.get_group(level)['Hours'] for level in range(1, 6)
)
# Recorded run: P (0.0018190082469875675) < 0.05 - the variances differ
# (the box plot shows the same result).
print(
    "Levene test(hours~tiredness):\n",
    levene(group_hours_1, group_hours_2, group_hours_3, group_hours_4, group_hours_5),
)

# Same test across the two breakfast groups.
groups_by_breakfast = df.groupby("Breakfast")
group_hours_yes, group_hours_no = (
    groups_by_breakfast.get_group(label)['Hours']
    for label in ("had breakfast", "no breakfast")
)
# Recorded run: P (0.3270310216888861) > 0.05 - equal variances are not rejected.
print("Levene test(hours~breakfast):\n", levene(group_hours_yes, group_hours_no))
Levene test(hours~tiredness): LeveneResult(statistic=4.634534448505077, pvalue=0.0018190082469875675) Levene test(hours~breakfast): LeveneResult(statistic=0.9701052432875272, pvalue=0.3270310216888861)
def groupTirednessValues(row):
    """Collapse the ``Tiredness_value`` score of *row* into three answers.

    Args:
        row: Any mapping-like object that supports ``row['Tiredness_value']``.

    Returns:
        'no' for scores below 3, 'yes' for scores above 3, and 'maybe'
        otherwise.
    """
    level = row['Tiredness_value']
    if level < 3:
        return 'no'
    return 'yes' if level > 3 else 'maybe'
# Label every row with its three-way tiredness answer.
df['Tiredness_value_answer'] = df.apply(groupTirednessValues, axis=1)

groupedByTireness = df.groupby('Tiredness_value_answer')
yes_group, no_group, maybe_group = (
    groupedByTireness.get_group(answer) for answer in ('yes', 'no', 'maybe')
)

# Cut every group down to the size of the smallest one by keeping its first
# rows, then stack the groups in the order yes, maybe, no.
common_size = min(map(len, (yes_group, no_group, maybe_group)))
equal_samle_sized_df = pd.concat(
    [group.head(common_size) for group in (yes_group, maybe_group, no_group)],
    ignore_index=True,
)
equal_samle_sized_df
| Hours | Breakfast | Tiredness_value | EnoughSleep | InPhoneReach | UsedPhoneBeforeSleep | Tiredness_value_answer | |
|---|---|---|---|---|---|---|---|
| 0 | 7 | no breakfast | 4 | False | True | True | yes |
| 1 | 7 | no breakfast | 4 | False | True | True | yes |
| 2 | 10 | no breakfast | 4 | False | True | True | yes |
| 3 | 6 | had breakfast | 4 | True | True | True | yes |
| 4 | 2 | no breakfast | 5 | False | True | True | yes |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 85 | 9 | had breakfast | 2 | True | False | False | no |
| 86 | 7 | had breakfast | 2 | True | False | True | no |
| 87 | 7 | no breakfast | 2 | True | True | True | no |
| 88 | 7 | had breakfast | 2 | False | True | True | no |
| 89 | 7 | had breakfast | 2 | True | True | True | no |
90 rows × 7 columns
# one-way ANOVA: Hours is the dependent variable, the tiredness category is the factor
groups_frame = pd.DataFrame({
    "Hours": equal_samle_sized_df['Hours'],
    "Tiredness": equal_samle_sized_df['Tiredness_value_answer'],
})
groups_by_category_of_tiredness = groups_frame.groupby("Tiredness")
yes_group_hours, no_group_hours, maybe_group_hours = (
    groups_by_category_of_tiredness.get_group(answer)['Hours']
    for answer in ('yes', 'no', 'maybe')
)
# Recorded run: P (0.3679482317252606) > 0.05. It means that there is no
# significant effect of the tiredness category on sleep duration.
print(
    "one-way ANOVA:\n",
    stats.f_oneway(yes_group_hours, no_group_hours, maybe_group_hours),
    "\n",
)
one-way ANOVA: F_onewayResult(statistic=1.0113915177006665, pvalue=0.3679482317252606)
import statsmodels.api as sm
from statsmodels.formula.api import ols
# two-way ANOVA: Hours is the dependent variable; the tiredness category and
# the breakfast label are the categorical factors
groups_frame = pd.DataFrame({"Hours": equal_samle_sized_df['Hours'],
                             "Tiredness": equal_samle_sized_df['Tiredness_value_answer'],
                             'Breakfast': equal_samle_sized_df['Breakfast']})
model = ols('Hours ~ C(Tiredness) + C(Breakfast) + C(Tiredness):C(Breakfast)', data=groups_frame).fit()
# How to read the table:
#   PR(>F) of C(Tiredness):C(Breakfast) > 0.05 - no significant interaction
#   between Tiredness and Breakfast;
#   PR(>F) of a main effect > 0.05 - that factor has no statistically
#   significant effect on sleep duration.
# anova_lm selects the sum-of-squares type through the keyword ``typ``. The
# previous spelling ``type=2`` was not recognised, so the default Type I table
# was produced instead of the intended Type II one.
# NOTE(review): the output recorded in this file has the Type I layout;
# re-run and re-check the p-values before quoting them.
print("two-way ANOVA:\n", sm.stats.anova_lm(model, typ=2), "\n")
# p-value > 0.05 - normality of the residuals is not rejected
print("Shapiro test\n", stats.shapiro(model.resid))
# Q-Q plot of the residuals: points lying around the 45-degree line suggest
# that the residuals follow the reference distribution.
# NOTE(review): the reference distribution is a fitted Student's t
# (``stats.t`` with ``fit=True``), not a normal one - confirm this is intended.
import matplotlib.pyplot as plt
res = model.resid
fig = sm.qqplot(res, stats.t, fit=True, line="45")
plt.show()
two-way ANOVA:
df sum_sq mean_sq F PR(>F)
C(Tiredness) 2.0 4.422222 2.211111 1.013771 0.367244
C(Breakfast) 1.0 6.854180 6.854180 3.142568 0.079899
C(Tiredness):C(Breakfast) 2.0 0.135412 0.067706 0.031043 0.969445
Residual 84.0 183.210407 2.181076 NaN NaN
Shapiro test
ShapiroResult(statistic=0.9766788482666016, pvalue=0.10531754046678543)
import seaborn as sns
import matplotlib.image as mpimg
# Scatter of Hours against the tiredness score for the equal-sized sample,
# one marker style per breakfast label, without a regression line.
scatter_plot = sns.lmplot(
    x="Tiredness_value",
    y="Hours",
    data=equal_samle_sized_df,
    fit_reg=False,
    hue='Breakfast',
    markers=["o", "x"],
)
scatter_plot = scatter_plot.set(title='The relation between being tired and hours of sleep')

# Draw the picture as a faint backdrop (alpha 0.3) stretched over the given extent.
image = mpimg.imread('../sleeping_beauty.png')
plt.imshow(image, zorder=0, extent=[0.0, 6.0, 2.0, 11.0], aspect='auto', alpha=0.3)
plt.show()

# Box plot of Hours per tiredness answer with the observations overlaid.
fig = plt.figure(figsize=(10, 8))
box_axes = sns.boxplot(x='Tiredness_value_answer', y='Hours', data=equal_samle_sized_df, color='#ffccff')
box_axes.set(title='Sleep duration categorized by tiredness factor')
ax = sns.swarmplot(x="Tiredness_value_answer", y="Hours", data=equal_samle_sized_df, color='#660066', size=4)
plt.show()